These COVID-19 datasets are collected and maintained through a subsection of Johns Hopkins University. Each day, the total number of confirmed COVID-19 cases and deaths is updated for each country in the world, and each of the states in the US, to provide accurate and updated time series data on the proliferation of the virus. The datasets can be found at the following URL: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
library(tidyverse)
library(lubridate)
library(dplyr)
library(leaps)
library(plotly)
library(usmap)
# Base URL for the JHU CSSE COVID-19 time-series repository
base_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
# The four time-series files: US and global, confirmed cases and deaths
csv_files <- c(
  "time_series_covid19_confirmed_US.csv",
  "time_series_covid19_deaths_US.csv",
  "time_series_covid19_confirmed_global.csv",
  "time_series_covid19_deaths_global.csv"
)
dataset_urls <- str_c(base_url, csv_files)
# Read each dataset directly from GitHub
us_cases <- read_csv(dataset_urls[1])
us_deaths <- read_csv(dataset_urls[2])
global_cases <- read_csv(dataset_urls[3])
global_deaths <- read_csv(dataset_urls[4])
# Tidy the global data: pivot the one-column-per-date layout into one row
# per location-date, then drop the coordinate columns.
global_cases <- global_cases %>%
  pivot_longer(
    cols = -c(`Province/State`, `Country/Region`, Lat, Long),
    names_to = "date",
    values_to = "cases"
  ) %>%
  select(-c(Lat, Long))

global_deaths <- global_deaths %>%
  pivot_longer(
    cols = -c(`Province/State`, `Country/Region`, Lat, Long),
    names_to = "date",
    values_to = "deaths"
  ) %>%
  select(-c(Lat, Long))

# Join cases with deaths, standardize the column names, parse the date
# strings, and keep only rows with at least one confirmed case.
global <- global_cases %>%
  full_join(global_deaths) %>%
  rename(
    Country_Region = `Country/Region`,
    Province_State = `Province/State`
  ) %>%
  mutate(date = mdy(date)) %>%
  filter(cases > 0)
# Tidy the US data the same way: dates become rows, coordinates are dropped.
# Note the US files carry extra identifier columns (UID through Combined_Key),
# and the deaths file additionally carries Population.
us_cases <- us_cases %>%
  pivot_longer(
    cols = -(UID:Combined_Key),
    names_to = "date",
    values_to = "cases"
  ) %>%
  select(Admin2:cases) %>%
  mutate(date = mdy(date)) %>%
  select(-c(Lat, Long_))

us_deaths <- us_deaths %>%
  pivot_longer(
    cols = -(UID:Population),
    names_to = "date",
    values_to = "deaths"
  ) %>%
  select(Admin2:deaths) %>%
  mutate(date = mdy(date)) %>%
  select(-c(Lat, Long_))

# Combine cases and deaths into a single US dataset (joined on the
# shared identifier and date columns).
US <- us_cases %>%
  full_join(us_deaths)
# Give the global data a Combined_Key ("Province, Country") matching the
# format already present in the US data.
global <- global %>%
  unite("Combined_Key",
        c(Province_State, Country_Region),
        sep = ", ",
        na.rm = TRUE,
        remove = FALSE)

# Lookup table that supplies population figures for each region; the
# columns not needed for the join are dropped up front.
uid_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
uid <- read_csv(uid_url) %>%
  select(-c(Lat, Long_, Combined_Key, code3, iso2, iso3, Admin2))

# Attach Population to the global data; the single select both discards
# the leftover UID/FIPS columns and fixes the final column order.
global <- global %>%
  left_join(uid, by = c("Province_State", "Country_Region")) %>%
  select(Province_State, Country_Region, date,
         cases, deaths, Population, Combined_Key)
# Aggregate county-level rows up to state totals per date and compute
# deaths per million residents.
US_by_state <- US %>%
  group_by(Province_State, Country_Region, date) %>%
  summarize(cases = sum(cases), deaths = sum(deaths), Population = sum(Population)) %>%
  mutate(deaths_per_mill = deaths * 1000000 / Population) %>%
  select(Province_State, Country_Region, date, cases, deaths, deaths_per_mill, Population) %>%
  ungroup()

# National totals by date. (The original also summed Population and computed
# deaths_per_mill here, then immediately discarded both in the select; that
# dead computation is removed — the resulting columns are identical.)
US_totals <- US_by_state %>%
  group_by(Country_Region, date) %>%
  summarize(cases = sum(cases), deaths = sum(deaths)) %>%
  select(Country_Region, date, cases, deaths) %>%
  ungroup()

# Cumulative national cases and deaths over time, on a log scale.
US_totals %>%
  filter(cases > 0) %>%
  ggplot(aes(x = date, y = cases)) +
  geom_line(aes(color = "cases")) +
  geom_point(aes(color = "cases")) +
  geom_line(aes(y = deaths, color = "deaths")) +
  geom_point(aes(y = deaths, color = "deaths")) +
  scale_y_log10() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90)) +
  labs(title = "COVID-19 in the US", y = "Total", x = "Date")
The COVID-19 cases and deaths in the United States both continue to rise over time; however, after a certain point it becomes difficult to discern the rate of change because of the logarithmic scale of the graph. A more digestible way to examine the COVID-19 case data is to instead look at the number of new cases and new deaths each day, rather than the aggregate cases and deaths.
# Cumulative cases and deaths for a single state, log scale.
state <- "Virginia"
US_by_state %>%
  filter(Province_State == state, cases > 0) %>%
  ggplot(aes(x = date, y = cases)) +
  geom_line(aes(color = "cases")) +
  geom_point(aes(color = "cases")) +
  geom_line(aes(y = deaths, color = "deaths")) +
  geom_point(aes(y = deaths, color = "deaths")) +
  scale_y_log10() +
  labs(title = "COVID19 in Virginia", y = NULL) +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90))
Similar to the national data, the cases and deaths both continue to rise in the state of Virginia. Again, we will leverage the use of new cases and deaths per day to receive a better understanding of whether the spread of COVID-19 is accelerating, decelerating, or holding steady.
max(US_totals$date)
## [1] "2021-06-20"
max(US_totals$deaths)
## [1] 601824
# Daily increments from the cumulative series.
# BUG FIX: the original applied lag() across the whole state-level tibble
# without grouping, so the first row of each state was differenced against
# the LAST row of the previous state, producing meaningless (often large
# negative) new_cases/new_deaths at every state boundary. Grouping by
# Province_State makes each state's first difference NA instead.
US_by_state <- US_by_state %>%
  group_by(Province_State) %>%
  mutate(new_cases = cases - lag(cases),
         new_deaths = deaths - lag(deaths)) %>%
  ungroup()
# US_totals holds a single national time series, so an ungrouped lag is correct.
US_totals <- US_totals %>%
  mutate(new_cases = cases - lag(cases),
         new_deaths = deaths - lag(deaths))
tail(US_totals %>% select(new_cases, new_deaths, everything()))
## # A tibble: 6 x 6
## new_cases new_deaths Country_Region date cases deaths
## <dbl> <dbl> <chr> <date> <dbl> <dbl>
## 1 11304 308 US 2021-06-15 33486038 600329
## 2 12430 368 US 2021-06-16 33498468 600697
## 3 10399 281 US 2021-06-17 33508867 600978
## 4 20608 593 US 2021-06-18 33529475 601571
## 5 8520 170 US 2021-06-19 33537995 601741
## 6 3892 83 US 2021-06-20 33541887 601824
# Daily new cases and new deaths nationally, log scale.
US_totals %>%
  ggplot(aes(x = date, y = new_cases)) +
  geom_line(aes(color = "new_cases")) +
  geom_point(aes(color = "new_cases")) +
  geom_line(aes(y = new_deaths, color = "new_deaths")) +
  geom_point(aes(y = new_deaths, color = "new_deaths")) +
  scale_y_log10() +
  labs(title = "COVID19 in the US", y = "Totals", x = "Date") +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90))
We see that the number of new cases and new deaths per day has been decreasing since the beginning of the 2021 calendar year. Despite this, new COVID-19 deaths in the US still number in the hundreds per day, and new COVID-19 cases are around 10,000 a day. The spread of the virus in the US appears to be lessening, but it is still spreading rapidly.
# Daily new cases and new deaths for a single state, log scale.
state <- "Virginia"
US_by_state %>%
  filter(Province_State == state, cases > 0) %>%
  ggplot(aes(x = date, y = new_cases)) +
  geom_line(aes(color = "new_cases")) +
  geom_point(aes(color = "new_cases")) +
  geom_line(aes(y = new_deaths, color = "new_deaths")) +
  geom_point(aes(y = new_deaths, color = "new_deaths")) +
  scale_y_log10() +
  labs(title = "COVID19 in Virginia", y = NULL) +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90))
Specifically in Virginia, the volume of new daily cases at the beginning of the year was cresting at nearly 10000, and now it is all the way down to only 100 new cases per day. Likewise, the new daily deaths has dropped to less than 10, so it appears that Virginia is making good headway towards curbing COVID-19.
# Per-state cumulative totals: since cases/deaths are cumulative, max()
# gives the latest figure; rates are expressed per thousand residents.
# States with zero cases or zero recorded population are excluded.
US_state_totals <- US_by_state %>%
group_by(Province_State) %>%
summarize(deaths = max(deaths), cases = max(cases),
population = max(Population),
cases_per_thou = 1000 * cases / population,
deaths_per_thou = 1000 * deaths / population) %>%
filter(cases > 0, population > 0)
# Ten states/territories with the lowest death rate per thousand.
US_state_totals %>%
slice_min(deaths_per_thou, n = 10)
## # A tibble: 10 x 6
## Province_State deaths cases population cases_per_thou deaths_per_thou
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Northern Mariana Isl~ 2 183 55144 3.32 0.0363
## 2 Virgin Islands 30 3752 107268 35.0 0.280
## 3 Hawaii 513 37353 1415872 26.4 0.362
## 4 Vermont 256 24360 623989 39.0 0.410
## 5 Alaska 373 70875 740995 95.6 0.503
## 6 Maine 854 68826 1344212 51.2 0.635
## 7 Oregon 2754 206777 4217737 49.0 0.653
## 8 Puerto Rico 2540 139753 3754939 37.2 0.676
## 9 Utah 2330 411610 3205958 128. 0.727
## 10 Washington 5856 447203 7614893 58.7 0.769
# Full table with the per-thousand rate columns shown first.
US_state_totals %>%
select(deaths_per_thou, cases_per_thou, everything())
## # A tibble: 55 x 6
## deaths_per_thou cases_per_thou Province_State deaths cases population
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2.31 112. Alabama 11306 548657 4903185
## 2 0.503 95.6 Alaska 373 70875 740995
## 3 2.45 122. Arizona 17843 889727 7278717
## 4 1.95 115. Arkansas 5874 345605 3017804
## 5 1.60 96.4 California 63335 3808258 39512223
## 6 1.17 96.2 Colorado 6732 553868 5758736
## 7 2.32 97.8 Connecticut 8270 348665 3565287
## 8 1.73 112. Delaware 1683 109548 973764
## 9 1.62 69.8 District of Columbia 1141 49243 705749
## 10 1.75 110. Florida 37555 2354416 21477737
## # ... with 45 more rows
# Model 1: simple linear regression — death rate predicted by case rate alone.
mod <- lm(deaths_per_thou ~ cases_per_thou, data = US_state_totals)
summary(mod)
##
## Call:
## lm(formula = deaths_per_thou ~ cases_per_thou, data = US_state_totals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41872 -0.21743 -0.02948 0.20909 1.04943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.013040 0.212328 -0.061 0.951
## cases_per_thou 0.016812 0.002123 7.918 1.51e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4657 on 53 degrees of freedom
## Multiple R-squared: 0.5419, Adjusted R-squared: 0.5333
## F-statistic: 62.7 on 1 and 53 DF, p-value: 1.511e-10
# Model 2: adds state population as a second predictor.
mod2 <- lm(deaths_per_thou ~ cases_per_thou + population, data = US_state_totals)
summary(mod2)
##
## Call:
## lm(formula = deaths_per_thou ~ cases_per_thou + population, data = US_state_totals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.35691 -0.25883 -0.00448 0.22692 1.01691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.254e-02 2.094e-01 -0.251 0.803
## cases_per_thou 1.626e-02 2.106e-03 7.719 3.55e-10 ***
## population 1.530e-08 8.686e-09 1.762 0.084 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4567 on 52 degrees of freedom
## Multiple R-squared: 0.5677, Adjusted R-squared: 0.5511
## F-statistic: 34.15 on 2 and 52 DF, p-value: 3.386e-10
# F-test comparing the nested models: does population add explanatory power
# beyond cases_per_thou?
anova(mod, mod2)
## Analysis of Variance Table
##
## Model 1: deaths_per_thou ~ cases_per_thou
## Model 2: deaths_per_thou ~ cases_per_thou + population
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 53 11.493
## 2 52 10.845 1 0.6475 3.1045 0.08395 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# State/territory with the lowest cumulative case rate per thousand.
US_state_totals %>%
slice_min(cases_per_thou)
## # A tibble: 1 x 6
## Province_State deaths cases population cases_per_thou deaths_per_thou
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Northern Mariana Islan~ 2 183 55144 3.32 0.0363
# State/territory with the highest cumulative case rate per thousand.
US_state_totals %>%
slice_max(cases_per_thou)
## # A tibble: 1 x 6
## Province_State deaths cases population cases_per_thou deaths_per_thou
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North Dakota 1554 110567 762062 145. 2.04
# Fitted death rates from the one-predictor model, attached per state.
US_state_totals %>%
mutate(pred = predict(mod))
## # A tibble: 55 x 7
## Province_State deaths cases population cases_per_thou deaths_per_thou pred
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 11306 5.49e5 4903185 112. 2.31 1.87
## 2 Alaska 373 7.09e4 740995 95.6 0.503 1.60
## 3 Arizona 17843 8.90e5 7278717 122. 2.45 2.04
## 4 Arkansas 5874 3.46e5 3017804 115. 1.95 1.91
## 5 California 63335 3.81e6 39512223 96.4 1.60 1.61
## 6 Colorado 6732 5.54e5 5758736 96.2 1.17 1.60
## 7 Connecticut 8270 3.49e5 3565287 97.8 2.32 1.63
## 8 Delaware 1683 1.10e5 973764 112. 1.88
## 9 District of Co~ 1141 4.92e4 705749 69.8 1.62 1.16
## 10 Florida 37555 2.35e6 21477737 110. 1.75 1.83
## # ... with 45 more rows
# Attach fitted values from both models, plus std_ratio: each state's
# deaths-to-cases ratio centered on the ratio of the means and scaled by
# the standard deviation of the per-state ratios (a z-score-like measure
# of how unusual a state's case fatality is).
US_total_w_pred <- US_state_totals %>%
mutate(pred = predict(mod), pred2 = predict(mod2), std_ratio = ((deaths_per_thou / cases_per_thou) - (mean(deaths_per_thou) / mean(cases_per_thou))) / sd(deaths_per_thou / cases_per_thou))
US_total_w_pred
## # A tibble: 55 x 9
## Province_State deaths cases population cases_per_thou deaths_per_thou pred
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 11306 5.49e5 4903185 112. 2.31 1.87
## 2 Alaska 373 7.09e4 740995 95.6 0.503 1.60
## 3 Arizona 17843 8.90e5 7278717 122. 2.45 2.04
## 4 Arkansas 5874 3.46e5 3017804 115. 1.95 1.91
## 5 California 63335 3.81e6 39512223 96.4 1.60 1.61
## 6 Colorado 6732 5.54e5 5758736 96.2 1.17 1.60
## 7 Connecticut 8270 3.49e5 3565287 97.8 2.32 1.63
## 8 Delaware 1683 1.10e5 973764 112. 1.73 1.88
## 9 District of Co~ 1141 4.92e4 705749 69.8 1.62 1.16
## 10 Florida 37555 2.35e6 21477737 110. 1.75 1.83
## # ... with 45 more rows, and 2 more variables: pred2 <dbl>, std_ratio <dbl>
# Actual values (blue) against fitted values from mod (red) and mod2 (green).
US_total_w_pred %>%
ggplot() +
geom_point(aes(x = cases_per_thou, y = deaths_per_thou), color = "blue") +
geom_point(aes(x = cases_per_thou, y = pred), color = "red") +
geom_point(aes(x = cases_per_thou, y = pred2), color = "green")
We can create a linear regression model with deaths_per_thou as the response and cases_per_thou as the sole predictor. The F-statistic for the model is statistically significant at the alpha level of 0.05, so we can conclude that the model is a better fit to the data than the null model would be. The p-value for the predictor is also significant at the alpha level of 0.05, indicating that it should be included in the model. Using the linear model, we can make predictions for the fitted values of the response for different values of the predictor. By juxtaposing the fitted values (in red) and the actual values (in blue), it is evident that while there is certainly a linear trend in the actual data, a large portion of the variability in the response is unexplained by our simple linear regression model. This can also be seen analytically by examining the adjusted \(R^2\) value, which is ~0.53.
Creating a second linear model and adding in a second predictor, population, could potentially help model the response more accurately. While this model does have a lower residual sum of squares (RSS) than the one predictor model, it is known that adding predictors will always decrease the RSS. To check if this new model is better, we perform an F-test on the two models using the anova function. Since the F-statistic of 3.1045 yields a p-value of 0.084, we fail to reject the null hypothesis that the additional predictor provides no improvement — that is, there is insufficient evidence at the alpha = 0.05 level that the two models are statistically different. When two models offer similar explanatory power, we opt to select the more parsimonious model to eschew needless complexity. This means we should select the single predictor model as the better model of the two.
Plotting the second model’s fitted values (in green) against the actual response values (still in blue) looks, graphically, like a much better model fit. While the second model would be judged statistically better at the alpha = 0.1 level, it would be unethical to select or change the critical level after running the analysis; the pre-selected alpha value must be maintained to preserve scientific integrity.
# Interactive scatter of cases vs deaths per thousand by state; hover text
# shows the state name and marker color encodes population.
fig <- plot_ly(
  US_total_w_pred,
  x = ~cases_per_thou,
  y = ~deaths_per_thou,
  text = ~Province_State,
  type = 'scatter',
  mode = 'markers',
  color = ~population,
  marker = list(opacity = 0.7)
)
fig <- fig %>%
  layout(title = 'COVID-19 Cases and Deaths by US State',
         xaxis = list(showgrid = FALSE),
         yaxis = list(showgrid = FALSE))
# NOTE(review): an add_trace() call with a custom hovertemplate was drafted
# here but left disabled in the original; the default hover text already
# shows the needed fields, so it is omitted.
fig
To get a better picture of what the data for each individual state looks like, we can use plotly to get hovertext to show up over each data point. This enables us to see the name of the state, the cases_per_thou, the deaths_per_thou, and the population can be estimated by the colorscale shown in the legend. Interestingly, most of the states with both the lowest and the highest cases_per_thou are low in total population. The data suggests that there are some other important factors at play that are not fully encapsulated by the dataset.
# plot_usmap() matches rows on a column literally named "state", so copy
# Province_State into that name before mapping.
US_chloropleth <- US_state_totals %>%
  mutate(state = Province_State)

# Choropleth of deaths per thousand residents by state.
plot_usmap(data = US_chloropleth, values = "deaths_per_thou") +
  scale_fill_gradient(name = "Deaths per thousand",
                      low = "green", high = "red") +
  theme(legend.position = "right") +
  labs(title = "COVID-19 Deaths Per Thousand People in the US")

# Choropleth of cases per thousand residents by state.
plot_usmap(data = US_chloropleth, values = "cases_per_thou") +
  scale_fill_gradient(name = "Cases per thousand",
                      low = "green", high = "red") +
  theme(legend.position = "right") +
  labs(title = "COVID-19 Cases Per Thousand People in the US")
One question of interest is what is the effect of geographic location on the severity of COVID-19 cases and deaths? Are there clusters of states that have similar COVID-19 rates? To check on this, we can construct a choropleth for both cases_per_thou and deaths_per_thou to see if there are any pictorially discernible patterns. The cases_per_thou choropleth shows two clusters in the Northwest and the Northeast with a low number of cases and then one cluster in the upper Midwest with a high number of cases. The deaths_per_thou choropleth shows low death clusters in the same places again, but the highest death cluster is in the lower Northeast this time and the second highest death cluster is the entire line of Southern states. Both choropleths strongly suggest that the geographical aspect of the COVID-19 spread is relevant to some degree.
COVID-19 is a very serious pandemic and the more we can understand about how and why it spreads, the better we can fight against its proliferation. The collected data gives us some insight on how the population of an area and the number of cases per thousand citizens can effect the number of deaths per thousand citizens for that area. I chose to focus my statistical modeling on the US data and used linear regression models to procure predictions for the deaths per thousand for each state. However, the linear models available to explain the response given the predictors available in the data could be drastically improved. For the purposes of doing a deeper analysis, a few predictors that could be a boon to be able to add to the models at the state level would be population density, political view, lockdown restrictions, testing rate, daily vaccinations, and temperature.
One large potential source of bias in the data collection is how the states elect to report confirmed cases and deaths. Depending on their individual infrastructure for data collection, reports could be slow, incomplete, or misrecorded. Another important factor that can bias the data of the COVID-19 cases would be the rate at which individuals are getting tested, both asymptomatically and after they feel ill. The true population parameter of infected people could be dramatically different if sick individuals are not getting officially recorded in the system, leading to a significant bias in the direction of underestimation. A source of potential personal bias is that I chose to focus on the US data for modeling and largely ignored the global data, since I am from the United States. One final bias source is the alpha level of 0.05 that I chose before starting to analyze; I picked the default value because I do not have knowledge about virology, so it is possible that the typical value for this field of study should be different. There could have been different conclusions to draw when examining a more eclectic source of COVID-19 data that I missed out on by analyzing from a domestic point of view. The data shows that COVID-19 is still very potent but on the decline, so hopefully by continuing to employ safety measures and distributing vaccines the brunt of the pandemic will soon be behind us.
# Record of the R version, platform, and package versions used, for
# reproducibility of this analysis.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 8.1 x64 (build 9600)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] usmap_0.5.2 plotly_4.9.3 leaps_3.1 lubridate_1.7.10
## [5] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.5 purrr_0.3.4
## [9] readr_1.4.0 tidyr_1.1.3 tibble_3.1.1 ggplot2_3.3.3
## [13] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 assertthat_0.2.1 digest_0.6.27 utf8_1.2.1
## [5] R6_2.5.0 cellranger_1.1.0 backports_1.2.1 reprex_2.0.0
## [9] evaluate_0.14 httr_1.4.2 highr_0.9 pillar_1.6.0
## [13] rlang_0.4.10 lazyeval_0.2.2 curl_4.3 readxl_1.3.1
## [17] rstudioapi_0.13 data.table_1.14.0 jquerylib_0.1.3 rmarkdown_2.7
## [21] labeling_0.4.2 htmlwidgets_1.5.3 munsell_0.5.0 broom_0.7.6
## [25] compiler_3.6.1 modelr_0.1.8 xfun_0.22 pkgconfig_2.0.3
## [29] htmltools_0.5.1.1 tidyselect_1.1.0 fansi_0.4.2 viridisLite_0.4.0
## [33] crayon_1.4.1 dbplyr_2.1.1 withr_2.4.2 grid_3.6.1
## [37] jsonlite_1.7.2 gtable_0.3.0 lifecycle_1.0.0 DBI_1.1.1
## [41] magrittr_2.0.1 scales_1.1.1 cli_2.4.0 stringi_1.5.3
## [45] farver_2.1.0 fs_1.5.0 xml2_1.3.2 bslib_0.2.4
## [49] ellipsis_0.3.1 generics_0.1.0 vctrs_0.3.7 tools_3.6.1
## [53] glue_1.4.2 crosstalk_1.1.1 hms_1.0.0 yaml_2.2.1
## [57] colorspace_2.0-0 rvest_1.0.0 knitr_1.33 haven_2.4.0
## [61] sass_0.3.1